/* Creating final version of firm, person-level HLFS, and person-level IDI data sets. */


#delimit ;

* start from an empty session and do not pause long output;
clear all;
set more off;

* base name that is quoted, with a .do suffix, in the notes attached to each saved data set;
local outfile "GWgap_postreadin_v4";

* show the run date and time;
di _n "$S_DATE $S_TIME";






*1*******************************************************************************;
* firm level data set of everything;
********************************************************************************;
/*
two samples all for 2002-2016;
* "all"
* at firms with (4) head count 5+, (2) non-missing and (3) good prody data, (1) non-missing ffe_m and ffe_f, (5) without missing ftes
construct ct dropping out each step

Note there are two separate files, one for head count>=5 and one for smaller firms
*/


* Builds and saves one firm-level data set per size class. The empty suffix reads the;
* head count>=5 input file and the _hclt5 suffix reads the head count<5 input file;
foreach sz in "" _hclt5 {;

	* description of the size class, quoted in the data set note written before saving;
	if "`sz'"=="" local size "head count>=5";
	if "`sz'"=="_hclt5" local size "head count<5";

	use GWgap_readin_v3b_firm`sz', clear;
	* the input file carries a leftover _merge variable that would block the merges below;
	drop _merge;

	* restricting to years 2002-2016;
	keep if year>=2002 & year<=2016;
	* these year dummies are dropped if present, capture ignores the ones that are absent;
	foreach var in Yyear_2002 Yyear_2017 Yyear_2018 Yyear_2019  {;
		capture drop `var';
	};
	
	* merging on age breakdowns (firm-years without a match are kept);
	merge 1:1 pent year using GWgap_readin_v3b_firm_age, keep(master match) nogen; 

	* merging on firm FE. The male and female files both name the effect ffe, so it is;
	* renamed straight after each merge to keep the two apart;
	merge m:1 pent using GWgap_2wFE_extract_v4_ffe_m, keep(master match) nogen keepus(ffe);
	rename ffe ffe_m;
	label var ffe_m "CCK firm FE calculated using males only";
	merge m:1 pent using GWgap_2wFE_extract_v4_ffe_f, keep(master match) nogen keepus(ffe);
	rename ffe ffe_f;
	label var ffe_f "CCK firm FE calculated using females only";
	* the both-gender effect is requested by its full name ffe_t, as in the person-level;
	* sections, rather than relying on ffe being accepted as an abbreviation of it;
	merge m:1 pent using GWgap_2wFE_extract_v4_ffe_t, keep(master match) nogen keepus(ffe_t);
	label var ffe_t "CCK firm FE calculated using both genders";

	* making log interaction controls and scaling them;
	gen lnKsq = (lnK^2)/100;
	gen lnK_lnM = lnK*lnM/10;
	gen lnMsq = (lnM^2)/100;

	* making main sample indicators (each is 0/1 and never missing);
	gen Q_ffe = ffe_m<. & ffe_f<.;
	label var Q_ffe "Sample with non-missing male and female firm FE";

	gen Q_inprod = in_prody==1;
	label var Q_inprod "Sample in the productivity data";

	gen Q_gprod = Q_inprod==1 & o400_ch==0 & o999p==0 & lnM<. & lnK<. & lngo<.; 
	label var Q_gprod "Sample with good productivity data";
	notes Q_gprod: Good means none of K, M, or go changed more than 400% from the previous year, and
		none of them are zero or above their 99.9th percentile.;
		
	* the upper bound excludes missing head counts, which Stata treats as larger than any number;
	gen Q_hc5 = L_hc_t>=5 & L_hc_t<.;
	label var Q_hc5 "Indicator for firm has a head count of at least 5";
		
	* restricted sample combines the indicators above with non-missing fte, industry and age breakdowns;
	gen Q_rest = Q_ffe==1 & Q_gprod==1 & Q_hc5==1 & av_fte_m<. & av_fte_f<. & pf_ind!=""
		& av_fte_lt25<. & av_fte_25to39<. & av_fte_40to54<. &  av_fte_55p<.;
	label var Q_rest "Indicator for restricted sample";
	notes Q_rest: Has male and female firm FE, very good productivity data with non-zero inputs, 
		and a head count of 5 or more, non-missing average ftes for males and females (set to
		1 if no such individuals), non-missing pf_ind industry, and a sufficiently high fraction
		of employees have non-missing age that age breakdown is not thrown out;

	compress;
	notes: GWgap_pr_firm`sz'_v4.dta was created by `outfile'.do on $S_DATE at $S_TIME. It is the
		main firm-level productivity data set for firms of size `size'. Use 
		Q_rest==1 to restrict to revised main restricted sample. Data are for years 2002-2016 only.;
	save GWgap_pr_firm`sz'_v4, replace;
};






*2*******************************************************************************;
* individual-level data set of people at their highest paying pents for IDI sample;

* Builds GWgap_pr_IDI_v4: person-year records at the highest paying pent, with firm and;
* worker fixed effects, the firm-level sample indicators, and a capped log earnings variable;
use GWgap_IDI_indivregdata, clear;

drop lnL_hc lnL_fte lnWpm in_prody_dat s_f tot_mon_wkd tot_fte_employee_av tot_gross_earn_yr indiv_avfte;

* restricting to years 2002-2016;
keep if year>=2002 & year<=2016;

* merging on firm fixed effects;
* hp_pent is temporarily renamed to pent so it matches the key in the using files,;
* and is renamed back once the firm-level merges are finished;

rename hp_pent pent;
merge m:1 pent using GWgap_2wFE_extract_v4_ffe_m, keep(master match) nogen keepus(ffe);
rename ffe ffe_m;
label var ffe_m "Firm fixed effect calculated from males only";
merge m:1 pent using GWgap_2wFE_extract_v4_ffe_f, keep(master match) nogen keepus(ffe);
rename ffe ffe_f;
label var ffe_f "Firm fixed effect calculated from females only";
merge m:1 pent using GWgap_2wFE_extract_v4_ffe_t, keep(master match) nogen keepus(ffe_t);
label var ffe_t "Firm fixed effect calculated from both genders";

* merging on worker fixed effects;
* the male and female files both name the effect wfe, so it is renamed after each merge;

foreach g in m f {;
	merge m:1 snz_uid using GWgap_2wFE_extract_v4_wfe_`g', keep(master match) nogen keepus(wfe);
	rename wfe wfe_`g';
};
merge m:1 snz_uid using GWgap_2wFE_extract_v4_wfe_t, keep(master match) nogen keepus(wfe_t);
label var wfe_m "Worker fixed effect calculated from males only";
label var wfe_f "Worker fixed effect calculated from females only";
label var wfe_t "Worker fixed effect calculated from both genders";

* merging on sample variables;
* indicators from the head count>=5 file are parked under a Q5_ prefix so the same names can;
* then be read from the head count<5 file;

merge m:1 pent year using GWgap_pr_firm_v4, keep(master match) keepus(Q_*) nogen;
rename Q_* Q5_*;
merge m:1 pent year using GWgap_pr_firm_hclt5_v4, keep(master match) keepus(Q_*) nogen;
rename pent hp_pent;

* take the small-firm value when there is one, otherwise the head count>=5 value. A person-year;
* matching neither firm file keeps a missing indicator unless one of the zeroing rules applies;
foreach var in ffe inprod gprod hc5 rest {;
	replace Q_`var' = Q5_`var' if Q_`var'==.;
	* pents whose identifier starts with ZZ are never in a sample;
	replace Q_`var' = 0 if substr(hp_pent,1,2)=="ZZ";
	* nor are person-years with missing age or fte measures;
	replace Q_`var' = 0 if age==. | max_fte_employee_av==. | indiv_avfte_hp==.;
};
drop Q5_*;

* extra earnings variable;
* earnings are top-coded at 500,000 before logging. In the notes the dollar sign is escaped with;
* a backslash, otherwise Stata reads it as the start of a global macro name and drops the digits;
replace max_gross_earn_yr = 500000 if max_gross_earn_yr>500000 & max_gross_earn_yr<.;
notes max_gross_earn_yr: Values above \$500,000 are set to \$500,000.;
gen lnIDI_earnhp = ln(max_gross_earn_yr);
label var lnIDI_earnhp "Log real annual earnings from highest-paying pent (IDI)";
notes lnIDI_earnhp: Values above \$500,000 are set to \$500,000.;

compress;
notes: GWgap_pr_IDI_v4.dta was created by `outfile'.do on $S_DATE at $S_TIME. It is an individual
level data set of highest paying pent each year 2002-16. Use Q_rest==1 to restrict to revised
main restricted sample.;
save GWgap_pr_IDI_v4, replace;








*3*******************************************************************************;
* individual-level data set of people at their highest paying pents for matched HLFS-IDI sample;


* Starts the matched HLFS-IDI person-level data set: loads the regression data, then adds;
* productivity variables, sample indicators, ethnicity, firm fixed effects and capped log earnings;
use GWgap_HLFS_indivregdata, clear;

drop tot_gross_earn_yr tot_fte_employee_av tot_mon_wkd in_prody_dat deflator 
	lnWpm indiv_avfte hc10 fte10 lnL_fte;
	
* merging on prody data;
* hp_pent is temporarily renamed to pent so it matches the key in the using files. The merge;
* result is kept in prod_m rather than discarded;
rename hp_pent pent;
merge m:1 pent year using GWgap_readin_v3b_prody, keep(master match) keepus(go_nom M_nom K_nom 
	K_real lnK M_real lnM go_real lngo o400_ch o999p pf_ind) gen(prod_m);
label var pf_ind "Productivity industry of highest paying pent from IDI, firms in prody data only";

	
* restricting to years 2002-2016;
keep if year>=2002 & year<=2016;

* merging on sample variables;
* indicators from the head count>=5 file are parked under a Q5_ prefix so the same names can;
* then be read from the head count<5 file;
* NOTE(review): the first merge below has no nogen option, so its _merge variable stays in the;
* saved data set, unlike the parallel merge in the IDI section - confirm whether that is intended;

merge m:1 pent year using GWgap_pr_firm_v4, keep(master match) keepus(Q_*);
rename Q_* Q5_*;
merge m:1 pent year using GWgap_pr_firm_hclt5_v4, keep(master match) keepus(Q_*) nogen;

* take the small-firm value when there is one, otherwise the head count>=5 value, and force the;
* indicators to 0 for pents whose identifier starts with ZZ;
foreach var in ffe inprod gprod hc5 rest {;
	replace Q_`var' = Q5_`var' if Q_`var'==.;
	replace Q_`var' = 0 if substr(pent,1,2)=="ZZ";
};
drop Q5_*;

* merging in ethnicity;
merge m:1 snz_uid using GWgap_HLFS_eth, keep(master match) nogen;

* merging in CCK firm fixed effects;
* each effect gets an hp_ prefix as soon as it is merged, which also keeps the male and female;
* effects apart since both files name the variable ffe;
merge m:1 pent using GWgap_2wFE_extract_v4_ffe_m, keep(master match) keepus(ffe) nogen;
rename ffe hp_ffe_m;
label var hp_ffe_m "Firm fixed effect for highest paying pent estimated using men only"; 
merge m:1 pent using GWgap_2wFE_extract_v4_ffe_f, keep(master match) keepus(ffe) nogen;
rename ffe hp_ffe_f;
label var hp_ffe_f "Firm fixed effect for highest paying pent estimated using women only"; 
merge m:1 pent using GWgap_2wFE_extract_v4_ffe_t, keep(master match) keepus(ffe_t) nogen;
rename ffe_t hp_ffe_t;
label var hp_ffe_t "Firm fixed effect for highest paying pent estimated using both genders"; 
rename pent hp_pent;

* extra earnings variable;
* earnings are top-coded at 500,000 before logging. In the notes the dollar sign is escaped with;
* a backslash, otherwise Stata reads it as the start of a global macro name and drops the digits;
replace max_gross_earn_yr = 500000 if max_gross_earn_yr>500000 & max_gross_earn_yr<.;
notes max_gross_earn_yr: Values above \$500,000 are set to \$500,000.;
gen lnIDI_earnhp = ln(max_gross_earn_yr);
label var lnIDI_earnhp "Log real annual earnings from highest-paying pent (IDI)";
notes lnIDI_earnhp: Values above \$500,000 are set to \$500,000.;

* other vars: derived regression covariates and value label tidy-ups for the HLFS data set;
gen age2 = (age^2)/100;
label var age2 "Age squared (/100)";

* single qualification variable formed as the sum of the two HLFS qualification variables;
* presumably at most one of the two is non-zero for any person - TODO confirm;
gen hqual = hqual_to12_HLFS + hqual_fr13_HLFS;
label val hqual hqual;
label var hqual "Highest qualification";

* counts of 3 through 100 are collapsed into a single top category coded 3;
recode par_u18_ct_HLFS (3/100 = 3), gen(numkid_u18_cat);
label define numkid 3 "3 or more";
label val numkid_u18_cat numkid;
label var numkid_u18_cat "Number of dependent children parented";

* capital per head, set to 0 where nominal capital is zero, then rescaled;
gen KL_rat = exp(lnK)/(L_hc);
replace KL_rat = 0 if K_nom==0;
replace KL_rat = KL_rat/100000;
label var KL_rat "K/L ratio (/100,000)";

* gross output less intermediates, per head, rescaled to tens of thousands. In the label the;
* dollar sign is escaped with a backslash, otherwise Stata reads it as the start of a global;
* macro name and the label loses the dollar sign and the digit after it;
gen val_ad_pw = (exp(lngo) - exp(lnM))/(L_hc);
replace val_ad_pw = val_ad_pw/10000;
label var val_ad_pw "Value added per worker (real \$0,000s)";

* rewording selected entries of value labels that already exist;
label define age_cat  1 "Aged <25", modify;

label define hhcomp 1 "Household composition: Single, no children"
	9 "Unidentifiable household composition", modify;

label define numkid 0 "No children" 3 "3 or more children", modify;

* NOTE(review): exponentiating the log head count may land a hair below 5 for a firm with exactly;
* 5 heads, depending on storage precision - confirm against L_hc if the boundary matters;
gen hc5pl = exp(lnL_hc)>=5 & lnL_hc<.;
label var hc5pl "Highest paying pent has head count of 5 or more";

***** flags for complete regression controls, then the final HLFS sample indicator and save;

*** control sets for regression columns 1 to 9 (year interactions left out);

local c1 "female max_fte_employee_av i.age_cat i.year";
local c2 "female hrs_main_HLFS age age2 i.year";
local c3 "`c2' i.hqual_to12_HLFS i.hqual_fr13_HLFS i.rc_HLFS i.hhcomp_HLFS i.numkid_u18_cat i.eth_gp"; 
local c4 "`c3' i.occ3d_HLFS"; 
local c5 "`c4' i.eth_gp ";
local c6 "`c4' wkd_hpp_1ya wkd_hpp_2ya lnL_hc i.ind4";
local c7 "`c4' wkd_hpp_1ya wkd_hpp_2ya lnL_hc KL_rat val_ad_pw i.ind4";
local c8 "`c6' hp_ffe_m";
local c9 "`c5' lnL_hc i.year  KL_rat ";


* nonmsgb: 1 when every control used in columns 1-6 is non-missing;
* the factor-variable lists are expanded and the i. prefixes stripped to leave plain variable names;

fvunab ctllistb: `c1' `c2' `c3' `c4' `c5' `c6';
local ctllistb2: subinstr local ctllistb "i." "", all;
di "ctllistb2: `ctllistb2'";

gen nonmsgb = 1;
foreach ctl of varlist `ctllistb2' {;
	replace nonmsgb = 0 if `ctl'==.;
};
label var nonmsgb "All controls used in 6 columns as non-missing";

* female is blanked wherever a column 1-6 control is missing. This must stay ahead of the;
* nonmsg loop below, which checks female as one of its controls;
replace female = . if nonmsgb==0;


* nonmsg: 1 when every control used in any of the 9 columns is non-missing;

fvunab ctllist: `c1' `c2' `c3' `c4' `c5' `c6' `c7' `c8' `c9';
local ctllist2: subinstr local ctllist "i." "", all;
di "ctllist2: `ctllist2'";

gen nonmsg = 1;
foreach ctl of varlist `ctllist2' {;
	replace nonmsg = 0 if `ctl'==.;
};
label var nonmsg "All controls used in 9 columns as non-missing";

* the two source qualification variables are dropped once the checks above have used them;
drop hqual_to12_HLFS hqual_fr13_HLFS;

* final sample flag: firm-level restricted sample and complete HLFS controls;
gen Q_rest_HLFS = Q_rest==1 & nonmsg==1;
label var Q_rest_HLFS "In restricted sample and HLFS controls are non-missing";

compress;
notes: GWgap_pr_HLFS_v4.dta was created by `outfile'.do on $S_DATE at $S_TIME. It is an individual
level data set of highest paying pent each year 2002-16 for IDI-HLFS linked individuals. Use 
Q_rest_HLFS==1 to restrict to revised main restricted sample with non-missing HLFS covariates.;
save GWgap_pr_HLFS_v4, replace;





